library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.6     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✓ broom        0.8.0     ✓ rsample      0.1.1
## ✓ dials        0.1.1     ✓ tune         0.2.0
## ✓ infer        1.0.0     ✓ workflows    0.2.6
## ✓ modeldata    0.1.1     ✓ workflowsets 0.2.1
## ✓ parsnip      0.2.1     ✓ yardstick    0.0.9
## ✓ recipes      0.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(kknn)
library(tidytext)
library(textrecipes)
library(here)
## here() starts at /Users/amitisraeli/peer_read
library(moderndive)
library(ranger)
library(recipes)
library(rsample)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:kknn':
## 
##     contr.dummy
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidymodels)
library(kknn)
# Global plotting defaults and a small reusable colour palette.
theme_set(theme_minimal())
scotblue <- "#0065BF"  # blue accent colour (not used in this chunk — presumably used elsewhere)
ukred <- "#D00C27"     # red accent colour (not used in this chunk — presumably used elsewhere)

Loading the processed data, which contains only the features and the recommendation score.

# Read the cleaned review data (308 rows x 15 cols). The CSV's unnamed
# first column (an exported row index) is auto-renamed to `...1`.
data <- read_csv('Data/clean_data.csv')
## New names:
## Rows: 308 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): conference, title, comments dbl (12): ...1, Date, id, IMPACT, SUBSTANCE,
## APPROPRIATENESS, MEANINGFUL_COM...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Quick sanity check of the column names before modelling.
colnames(data)
##  [1] "...1"                  "conference"            "Date"                 
##  [4] "title"                 "id"                    "IMPACT"               
##  [7] "SUBSTANCE"             "APPROPRIATENESS"       "MEANINGFUL_COMPARISON"
## [10] "SOUNDNESS_CORRECTNESS" "ORIGINALITY"           "CLARITY"              
## [13] "REVIEWER_CONFIDENCE"   "RECOMMENDATION"        "comments"
# Keep only the numeric aspect-score columns, dropping bookkeeping
# columns (row index `...1`, id, Date) that carry no modelling signal.
# `select(where(...))` replaces the superseded `select_if()`, and
# `any_of()` drops names whether or not they are present (e.g. "index").
drops <- c("id", "index", "Date", "...1")
numeric_cols <- data %>%
  select(where(is.numeric)) %>%
  select(-any_of(drops))

# Reshape to long format: one (name, value) row per aspect score,
# ready for faceted plotting below.
data_long <- data %>%
  pivot_longer(all_of(colnames(numeric_cols))) %>%
  as.data.frame()
data_long

Distributions of the numeric features

# Per-feature means, so each facet gets its own dashed mean line.
# BUG FIX: the original `aes(xintercept = mean(value))` evaluated the
# grand mean over ALL features, drawing the same line in every panel.
facet_means <- data_long %>%
  group_by(name) %>%
  summarise(value = mean(value), .groups = "drop")

ggplot(data_long, aes(x = value)) +
  # `after_stat()` (ggplot2 >= 3.3.0) replaces the deprecated `..density..`.
  geom_histogram(aes(y = after_stat(density)), binwidth = 1) +
  geom_density(col = "#FF0000") +
  geom_vline(data = facet_means, aes(xintercept = value),
             col = "#0096B7", linetype = "dashed", size = 0.75) +
  facet_wrap(~ name, scales = "free") +
  labs(x = "", y = "Density", title = "Quick Overview of the aspects",
       subtitle = "Histogram for each numeric feature, with density and mean line")

Split the data into training and test sets

# Reproducible train/test split (rsample default: 3/4 training),
# stratified on the outcome so the RECOMMENDATION distribution is
# similar in both sets.
set.seed(1234)
data_split <- initial_split(data, strata = RECOMMENDATION)
train <- training(data_split)
test  <- testing(data_split)

Fit the linear model

# Ordinary least squares: predict the overall recommendation score from
# the eight numeric review-aspect scores in the training set.
RECOMMENDATION_linear <- lm(
  RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS +
    ORIGINALITY + CLARITY + REVIEWER_CONFIDENCE,
  data = train
)

Histogram of the prediction error

# Per-observation fitted values and residuals (observed - fitted) from
# the linear model, via moderndive.
model_points <- get_regression_points(RECOMMENDATION_linear)

# Distribution of residuals; a roughly symmetric shape centred on zero
# suggests no gross misspecification.
# (Typo fixed in the displayed title: "Predicton" -> "Prediction".)
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
  labs(
    title = "Histogram of Recommendation Prediction of the linear model",
    x = "Recommendation Prediction Error",
    y = "Count"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic")
  )

Mean squared error of the linear model

# Training-set mean squared error of the linear model.
with(model_points, mean(residual^2))
## [1] 0.4389945

KNN

# k-nearest-neighbours classifier (k = 5) on four aspect scores; the
# numeric outcome is cast to a factor so parsnip treats the task as
# classification rather than regression.
knn_mod <- nearest_neighbor(mode="classification", neighbors=5) %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE + IMPACT, train)

knn_mod
## parsnip model object
## 
## 
## Call:
## kknn::train.kknn(formula = as.factor(RECOMMENDATION) ~ SUBSTANCE +     CLARITY + REVIEWER_CONFIDENCE + IMPACT, data = data, ks = min_rows(5,     data, 5))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4913043
## Best kernel: optimal
## Best k: 5
# NOTE(review): the saved output above shows `data = data`, but the code
# fits on `train` — the knitted output may be stale; re-knit to confirm.

Inspect the KNN predictions and the true labels

# Test-set predictions side by side with the observed recommendation scores.
knn_pred <- predict(knn_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
knn_pred

Confusion matrix for KNN

# Confusion matrix of the KNN model, plotted as a heat-map of counts.
# conf_mat() coerces the numeric RECOMMENDATION truth column to a factor
# (the source of the knit-time warning).
# Typos fixed in the displayed labels: "conffesion metrix" -> "Confusion
# matrix", "Predicton" -> "Prediction", "Recomndation" -> "Recommendation".
knn_pred %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%            # extract the underlying contingency table
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8)+
  labs(
    title = "Confusion matrix of the KNN model",
    x = "Prediction",
    y = "Recommendation Score"
  ) 

accuracy of knn

# Flag each test row as correct (1) or not (0). Comparing the numeric
# truth with the factor prediction works because `==` on a factor
# compares its character labels, which here are the digit score labels.
knn_pred$tf <- if_else(knn_pred$RECOMMENDATION == knn_pred$.pred_class, 1, 0)

# Accuracy: mean() of a 0/1 indicator is the idiomatic sum()/length().
mean(knn_pred$tf)
## [1] 0.474359

Random Forest

# Random forest classifier (ranger engine, 500 trees by default) on the
# same four aspect scores used for the KNN model above.
rf_mod <- rand_forest(mode="classification") %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE + IMPACT, train)

rf_mod
## parsnip model object
## 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      230 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.3510778

Inspect the Random Forest predictions and the true labels

# Test-set predictions side by side with the observed recommendation scores.
rf_pred <- predict(rf_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
rf_pred

Confusion matrix for Random Forest

# Confusion matrix of the random forest, plotted as a heat-map of counts.
# conf_mat() coerces the numeric RECOMMENDATION truth column to a factor
# (the source of the knit-time warning).
# Typos fixed in the displayed labels: "conffesion metrix" -> "Confusion
# matrix", "Predicton" -> "Prediction", "Recomndation" -> "Recommendation".
rf_pred %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%            # extract the underlying contingency table
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8)+
  labs(
    title = "Confusion matrix of the Random Forest model",
    x = "Prediction",
    y = "Recommendation Score"
  ) 

accuracy of Random Forest

# Flag each test row as correct (1) or not (0); see the KNN accuracy
# note on numeric-vs-factor `==` semantics.
rf_pred$tf <- if_else(rf_pred$RECOMMENDATION == rf_pred$.pred_class, 1, 0)

# Accuracy: mean() of a 0/1 indicator is the idiomatic sum()/length().
mean(rf_pred$tf)
## [1] 0.5769231

Neural network with a single hidden layer of 13 neurons

# Single-hidden-layer perceptron (13 hidden units) via parsnip's mlp().
# NOTE(review): unlike the KNN and random forest models, IMPACT is not
# among the predictors here — confirm whether that omission is intended.
nnet_mod <- mlp(mode="classification",
                hidden_units = 13) %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE, train)

nnet_mod
## parsnip model object
## 
## a 3-13-5 network with 122 weights
## inputs: SUBSTANCE CLARITY REVIEWER_CONFIDENCE 
## output(s): as.factor(RECOMMENDATION) 
## options were - softmax modelling

Inspect the Neural Network predictions and the true labels

# Test-set predictions side by side with the observed recommendation scores.
nnet_pred <- predict(nnet_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
nnet_pred

Confusion matrix for Neural Network

# Confusion matrix of the neural network, plotted as a heat-map of counts.
# conf_mat() coerces the numeric RECOMMENDATION truth column to a factor
# (the source of the knit-time warning).
# Typos fixed in the displayed labels: "conffesion metrix" -> "Confusion
# matrix", "Predicton" -> "Prediction", "Recomndation" -> "Recommendation".
nnet_pred %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%            # extract the underlying contingency table
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8)+
  labs(
    title = "Confusion matrix of the Neural Network model",
    x = "Prediction",
    y = "Recommendation Score"
  ) 

accuracy of the Neural Network

# Flag each test row as correct (1) or not (0); see the KNN accuracy
# note on numeric-vs-factor `==` semantics.
nnet_pred$tf <- if_else(nnet_pred$RECOMMENDATION == nnet_pred$.pred_class, 1, 0)

mean(nnet_pred$tf)
## [1] 0.5641026

# MSE between predicted and true recommendation scores.
# BUG FIX: `as.numeric()` on a factor returns the internal level codes
# (1, 2, ...), not the label values, so the original MSE (0.8076923 in
# the stale output) was computed on the wrong scale. Converting through
# as.character() recovers the actual score values; RECOMMENDATION is
# already numeric and needs no conversion.
mean((as.numeric(as.character(nnet_pred$.pred_class)) -
        nnet_pred$RECOMMENDATION)^2)